/*
 * friso main file implemented the friso main functions.
 * 		starts with friso_ in the friso header file "friso.h";
 * 
 * @author	chenxin <chenxin619315@gmail.com>
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include "friso_API.h"
#include "friso_ctype.h"
#include "friso.h"

/* {{{ create a new friso configuration variable.
*/
FRISO_API friso_t friso_new( void ) 
{
	friso_t e = ( friso_t ) FRISO_MALLOC( sizeof( friso_entry ) );
	if ( e == NULL ) {
		___ALLOCATION_ERROR___
	} 

	e->dic	= NULL;
	e->charset	= FRISO_UTF8;	//set default charset UTF8.

	return e;
}
/* }}} */

/* {{{ creat a new friso with initialize item from a configuration file.
 * 
 * @return 1 for successfully and 0 for failed.
 */
FRISO_API int friso_init_from_ifile( 
		friso_t friso, friso_config_t config, fstring __ifile ) 
{
	FILE *__stream;
	char __chars__[256], __key__[128], *__line__, __lexi__[128];
	uint_t i, t, __hit__ = 0, __length__;

	char *slimiter = NULL;
	uint_t flen = 0;

	//get the base part of the path of the __ifile
	if ( (slimiter = strrchr(__ifile, '/')) != NULL )
	{
		flen = slimiter - __ifile + 1;
	}

	//yat, start to parse the friso.ini configuration file
	if ( ( __stream = fopen( __ifile, "rb" ) ) != NULL ) 
	{
		//initialize the entry with the value from the ifile.
		while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL ) 
		{
			//comments filter.
			if ( __line__[0] == '#' ) continue;
			if ( __line__[0] == '\t' ) continue; 
			if ( __line__[0] == ' ' || __line__[0] == '\0' ) continue;

			__length__ = strlen( __line__ );
			for ( i = 0; i < __length__; i++ ) {
				if ( __line__[i] == ' ' 
						|| __line__[i] == '\t' || __line__[i] == '=' ) break;
				__key__[i] = __line__[i];
			}
			__key__[i] = '\0';

			//position the euqals char '='.
			if ( __line__[i] == ' ' || __line__[i] == '\t' ) {
				for ( i++ ; i < __length__; i++ ) 
					if ( __line__[i] == '=' ) break; 
			} 

			//clear the left whitespace of the value.
			for ( i++; i < __length__ 
					&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
			for ( t = 0; i < __length__; i++, t++ ) {
				if ( __line__[i] == ' ' || __line__[i] == '\t' ) break;
				__line__[t] = __line__[i]; 
			} 
			__line__[t] = '\0';

			//printf("key=%s, value=%s\n", __key__, __line__ );
			if ( strcmp( __key__, "friso.lex_dir" ) == 0 ) 
			{
				/*
				 * here copy the value of the lex_dir.
				 *		cause we need the value of friso.max_len to finish all
				 *	the work when we call function friso_dic_load_from_ifile to
				 *	initiliaze the friso dictionary.
				 */
				if ( __hit__ == 0 ) 
				{
					__hit__ = t;
					for ( t = 0; t < __hit__; t++ ) {
						__lexi__[t] = __line__[t];
					}
					__lexi__[t] = '\0';
				} 
			} else if ( strcmp( __key__, "friso.max_len" ) == 0 ) {
				config->max_len = ( ushort_t ) atoi( __line__ );
			} else if ( strcmp( __key__, "friso.r_name" ) == 0 ) {
				config->r_name = ( ushort_t ) atoi( __line__ );
			} else if ( strcmp( __key__, "friso.mix_len" ) == 0 ) {
				config->mix_len = ( ushort_t ) atoi( __line__ );
			} else if ( strcmp( __key__, "friso.lna_len" ) == 0 ) {
				config->lna_len = ( ushort_t ) atoi( __line__ );
			} else if ( strcmp( __key__, "friso.add_syn" ) == 0 ) {
				config->add_syn = ( ushort_t ) atoi( __line__ );
			} else if ( strcmp( __key__, "friso.clr_stw" ) == 0 ) {
				config->clr_stw = ( ushort_t ) atoi( __line__ );	
			} else if ( strcmp( __key__, "friso.keep_urec" ) == 0 ) {
				config->keep_urec = ( uint_t ) atoi( __line__ );	
			} else if ( strcmp( __key__, "friso.spx_out" ) == 0 ) {
				config->spx_out = ( ushort_t ) atoi( __line__ );
			} else if ( strcmp( __key__, "friso.nthreshold" ) == 0 ) {
				config->nthreshold = atoi( __line__ );
			} else if ( strcmp( __key__, "friso.mode" ) == 0 ) {
				config->mode = ( friso_mode_t ) atoi( __line__ );
			} else if ( strcmp( __key__, "friso.charset" ) == 0 ) {
				friso->charset = (friso_charset_t) atoi( __line__ );
			} else if ( strcmp( __key__, "friso.en_sseg") == 0 ) {
				config->en_sseg = (ushort_t) atoi( __line__ );
			} else if ( strcmp( __key__, "friso.st_minl") == 0 ) {
				config->st_minl = (ushort_t) atoi( __line__ );
			} else if ( strcmp( __key__, "friso.kpuncs") == 0 ) {
				//t is the length of the __line__.
				memcpy(config->kpuncs, __line__, t);
				//printf("friso_init_from_ifile#kpuncs: %s\n", config->kpuncs);
			}
		}

		/*
		 * intialize the friso dictionary here.
		 *		use the setting from the ifile parse above
		 *	we copied the value in the __lexi__
		 */
		if ( __hit__ != 0 ) 
		{
			//add relative path search support
			//@added: 2014-05-24
			//convert the relative path to absolute path base on the path of friso.ini
#ifdef FRISO_WINNT
			if ( __lexi__[1] != ':' && flen != 0 )
			{
				memcpy(__lexi__ + flen, __lexi__, __hit__);
				memcpy(__lexi__, __ifile, flen);
				//count the new length
				flen = flen + __hit__;
				if ( __lexi__[flen-1] != '/'  ) __lexi__[flen] = '/';
				__lexi__[flen+1] = '\0';
			}
#else
			if ( __lexi__[0] != '/' && flen != 0 )
			{
				memcpy(__lexi__ + flen, __lexi__, __hit__);
				memcpy(__lexi__, __ifile, flen);
				//count the new length
				flen = flen + __hit__;
				if ( __lexi__[flen-1] != '/'  ) __lexi__[flen] = '/';
				__lexi__[flen+1] = '\0';
			}
#endif

			friso->dic = friso_dic_new();
			//add charset check for max word length counting
			friso_dic_load_from_ifile( friso, config, 
					__lexi__, config->max_len * (friso->charset == FRISO_UTF8 ? 3 : 2) );
		}

		fclose( __stream );
		return 1;
	}

	return 0;
}
/* }}} */

/* {{{ friso free functions.
 * here we have to free its dictionary.
 */
FRISO_API void friso_free( friso_t friso ) 
{
	//free the dictionary
	if ( friso->dic != NULL ) {
		friso_dic_free( friso->dic );
	}
	FRISO_FREE( friso );
}
/* }}} */

/* {{{ create a new friso configuration entry and initialize 
 * it with default value.*/
FRISO_API friso_config_t friso_new_config( void )
{
	friso_config_t cfg = (friso_config_t) 
		FRISO_MALLOC(sizeof(friso_config_entry));
	if ( cfg == NULL ) {
		___ALLOCATION_ERROR___;
	}

	//initialize the configuration entry.
	friso_init_config(cfg);

	return cfg;
}
/* }}} */

/* {{{ initialize the specified friso config entry with default value.*/
FRISO_API void friso_init_config( friso_config_t cfg )
{
	cfg->max_len 	= DEFAULT_SEGMENT_LENGTH;
	cfg->r_name 	= 1;
	cfg->mix_len 	= DEFAULT_MIX_LENGTH;
	cfg->lna_len 	= DEFAULT_LNA_LENGTH;
	cfg->add_syn	= 1;
	cfg->clr_stw	= 0;
	cfg->keep_urec	= 0;
	cfg->spx_out	= 0;
	cfg->en_sseg	= 1;	//default start the secondary segmentaion.
	cfg->st_minl	= 1;	//min length for secondary split sub token.
	cfg->nthreshold 	= DEFAULT_NTHRESHOLD;
	cfg->mode 		= ( friso_mode_t ) DEFAULT_SEGMENT_MODE;

	//Zero fill the kpuncs buffer.
	memset(cfg->kpuncs, 0x00, sizeof(cfg->kpuncs));
}
/* }}} */

/* {{{ create a new segment task entry.
*/
FRISO_API friso_task_t friso_new_task() 
{
	friso_task_t task = ( friso_task_t ) FRISO_MALLOC( sizeof( friso_task_entry ) );
	if ( task == NULL ) {
		___ALLOCATION_ERROR___
	}

	//initliaze the segment.
	task->text 	= NULL;
	task->idx 	= 0;
	task->length = 0;
	task->bytes = 0;
	task->unicode = 0;
	task->ctrlMask = 0;
	task->pool	= new_link_list();
	task->sbuf	= new_string_buffer();
	task->hits = friso_new_hits();

	return task;
}
/* }}} */

/* {{{ free the specified task*/
FRISO_API void friso_free_task( friso_task_t task ) 
{
	//free the allocation of the poll link list.
	if ( task->pool != NULL ) {
		free_link_list( task->pool );
	}

	//release the allocation of the sbuff string_buffer_t.
	if ( task->sbuf != NULL ) {
		free_string_buffer(task->sbuf);
	}

	//free the allocations of the hits.
	if ( task->hits != NULL ) {
		friso_free_hits( task->hits );
	}

	FRISO_FREE( task );
}
/* }}} */

/* {{{ create a new friso hits */
FRISO_API friso_hits_t friso_new_hits( void ) 
{
	friso_hits_t hits = ( friso_hits_t ) 
		FRISO_MALLOC( sizeof( friso_hits_entry ) );
	if ( hits == NULL ) {
		___ALLOCATION_ERROR___
	}

	//initialize
	hits->type = (uchar_t) __LEX_OTHER_WORDS__;
	hits->length = 0;
	hits->rlen = 0;
	hits->pos = '\0';
	hits->offset = -1;
	memset(hits->word, 0x00, __HITS_WORD_LENGTH__);

	return hits;
}
/* }}} */

/* {{{ set the text of the current segmentation.
 *		that means we could re-use the segment.
 *	also we have to reset the idx and the length of the segmentation.
 * and the most important one - clear the poll link list.
 */
FRISO_API void friso_set_text( 
		friso_task_t task, fstring text ) 
{
	task->text = text;
	task->idx = 0;									//reset the index
	task->length = strlen( text );
	task->pool = link_list_clear( task->pool );		//clear the word poll
	string_buffer_clear( task->sbuf );				//crear the string buffer.
}
/* }}} */

/* **************************************************************
 * the static functions:					*
 * 		to assist the friso_next finish the work.	*
 ****************************************************************/
/* {{{ read the next word from the current position.
 * 
 * @return	int the bytes of the readed word.
 */
__STATIC_API__ uint_t readNextWord( 
		friso_t friso,			//friso instance
		friso_task_t task, 		//token task
		uint_t *idx, 			//current index.
		fstring __word ) 		//work buffer.
{
	if ( friso->charset == FRISO_UTF8 )
		//@reader: task->unicode = get_utf8_unicode(task->buffer) is moved insite
		//	function utf8_next_word from friso 1.6.0 .
		return utf8_next_word( task, idx, __word );
	else if ( friso->charset == FRISO_GBK )
		return gbk_next_word( task, idx, __word );

	return 0;		//unknow charset.
}
/* }}} */

/* {{{ get the next cjk word from the current position, with simple mode.
*/
__STATIC_API__ lex_entry_t next_simple_cjk( 
		friso_t friso, 
		friso_config_t config,
		friso_task_t task ) 
{
	uint_t t, idx = task->idx, __length__;
	string_buffer_t sb = new_string_buffer_with_string( task->buffer );
	lex_entry_t e = friso_dic_get( friso->dic, 
			__LEX_CJK_WORDS__, sb->buffer );

	/*
	 * here bak the e->length in the task->hits->type.
	 *		we will use it to count the task->idx.
	 * for the sake of use less variable.
	 */
	__length__ = e->length;

	for ( t = 1; t < config->max_len 
			&& ( task->bytes = readNextWord( 
					friso, task, &idx, task->buffer ) ) != 0; t++ ) 
	{
		if ( friso_whitespace( friso->charset, task ) ) break;
		if ( ! friso_cn_string( friso->charset, task ) ) break;

		string_buffer_append( sb, task->buffer );

		//check the existence of the word by search the dictionary.
		if ( friso_dic_match( friso->dic, 
					__LEX_CJK_WORDS__, sb->buffer ) ) {
			e = friso_dic_get( friso->dic, 
					__LEX_CJK_WORDS__, sb->buffer );
		}
	}

	//correct the offset of the segment.
	task->idx += ( e->length - __length__ );
	free_string_buffer( sb );							//free the buffer

	/*
	 * check the stopwords dictionary,
	 * 	make sure the current tokenzier is not stopwords.
	 * @warning: friso.clr_stw must be open in friso.ini configuration file.
	 */
	if ( config->clr_stw 
			&& friso_dic_match( friso->dic, 
				__LEX_STOPWORDS__, e->word ) ) {
		return NULL;	
	}

	return e;
}
/* }}} */

/* {{{ basic latin segmentation*/
/*convert full-width char  to half-width*/ 
#define convert_full_to_half( friso, task, convert ) \
	do {\
		if ( friso_fullwidth_en_char( friso->charset, task ) ) {	\
			if ( friso->charset == FRISO_UTF8 ) 	\
			task->unicode -= 65248;					\
			else if ( friso->charset == FRISO_GBK ) 	\
			{\
				task->buffer[0] = ((uchar_t)task->buffer[1]) - 128;	\
				task->buffer[1] = '\0';					\
			}\
			convert = 1;				\
		}												\
	} while (0)


/*convert uppercase char to lowercase char*/
#define convert_upper_to_lower( friso, task, convert ) \
	do {\
		if ( friso_uppercase_letter( friso->charset, task ) ) {		\
			if ( friso->charset == FRISO_UTF8 )		\
			task->unicode += 32;			\
			/* With the above logic(full to half), 					
			 * here we just need to check half-width*/	\
			else if ( friso->charset == FRISO_GBK )	\
				task->buffer[0] = task->buffer[0] + 32;	\
					convert = 1;			\
		}												\
	} while (0)

/* convert the unicode to utf-8 bytes. (FRISO_UTF8) */
#define convert_work_apply( friso, task, convert ) \
	do {\
		if ( convert == 1 && friso->charset == FRISO_UTF8 ) {	\
			memset( task->buffer, 0x00, 7 );			\
			unicode_to_utf8( task->unicode, task->buffer );	\
			convert = 0;					\
		} \
	} while ( 0 )


//get the next latin word from the current position.
__STATIC_API__ lex_entry_t next_basic_latin( 
		friso_t friso, 
		friso_config_t config,
		friso_task_t task ) 
{
	int __convert = 0, t = 0;
	int chkecm = 0, chkunits = 1, wspace = 0;

	/* cause friso will convert full-width numeric and letters
	 * 	(Not punctuations) to half-width ones. so, here we need
	 * wlen to record the real length of the lex_entry_t.
	 * */
	uint_t wlen = task->bytes;
	uint_t idx = task->idx;
	string_buffer_t sb, tmp = NULL;
	lex_entry_t e = NULL;

	//condition controller to start the secondary segmente.
	int ssseg = 0;
	int fdunits = 0;

	//secondray segmente.
	int tcount = 1;			//number fo different type of char.
	friso_enchar_t _ctype, _TYPE;
	task_ssseg_close(task);

	//full-half width and upper-lower case exchange.
	convert_full_to_half( friso, task, __convert );
	convert_upper_to_lower( friso, task, __convert );
	convert_work_apply( friso, task, __convert );

	//creat a new fstring buffer and append the task->buffer insite.
	sb = new_string_buffer_with_string( task->buffer );
	_TYPE = friso_enchar_type( friso->charset, task );

	//segmentation.
	while ( ( task->bytes = readNextWord( 
					friso, task, &idx, task->buffer ) ) != 0 ) 
	{
		//convert full-width to half-width.
		convert_full_to_half(friso, task, __convert);
		_ctype = friso_enchar_type( friso->charset, task );

		if ( _ctype == FRISO_EN_WHITESPACE )
		{
			wspace = 1;
			break;
		}

		if ( _ctype == FRISO_EN_PUNCTUATION )
		{
			//clear the full-width punctuations.
			if ( task->bytes > 1 ) break;
			if ( ! friso_en_kpunc( config, task->buffer[0] ) ) break;
		}

		/* check if is an FRISO_EN_NUMERIC, or FRISO_EN_LETTER.
		 * 	here just need to make sure it is not FRISO_EN_UNKNOW.
		 * */
		if ( _ctype == FRISO_EN_UNKNOW )
		{
			if ( friso_cn_string( friso->charset, task ) ) chkecm = 1;
			break;
		}

		//upper-lower case convert
		convert_upper_to_lower( friso, task, __convert );
		convert_work_apply( friso, task, __convert );
		string_buffer_append( sb, task->buffer );
		wlen += task->bytes;
		task->idx += task->bytes;

		/* Char type counter.
		 * 	make the condition to start the secondary segmentation.
		 *
		 * @TODO: 2013-12-22
		 * */
		if ( _ctype != _TYPE )
		{
			tcount++;
			_TYPE = _ctype;
		}
	}

	/*
	 * 1. clear the useless english punctuation 
	 * 		from the end of the buffer.
	 * 2. check the english and punctuation mixed word.
	 *
	 * set _ctype to as the status for the existence of punctuation
	 * 	at the end of the sb cause we need to plus the tcount
	 * 	to avoid the secondary check for work like 'c+', 'chenxin.'.
	 */
	_ctype = 0;
	for ( ; sb->length > 0 
			&& sb->buffer[ sb->length - 1 ] != '%' 
			&& is_en_punctuation( 
				friso->charset, sb->buffer[ sb->length - 1 ] ); ) 
	{
		//check the english punctuation mixed word.
		if ( friso_dic_match( friso->dic, 
					__LEX_ENPUN_WORDS__, sb->buffer  ) ) {
			e = friso_dic_get(friso->dic, 
					__LEX_ENPUN_WORDS__, sb->buffer);
			chkunits = 0;
			break;
		}

		//mark the end of the buffer.
		sb->buffer[ --sb->length ] = '\0';
		wlen--;
		task->idx--;

		/*check and plus the tcount*/
		if ( _ctype == 0 )
		{
			tcount--;
			_ctype = 1;
		}
	}

	//check the condition to start the secondary segmentation.
	ssseg = (tcount > 1) && (chkunits == 1);

	//check the tokenize loop is break by whitespace.
	//	no need for all the following work if it is. 
	//@added 2013-11-19
	if ( wspace == 1 || task->idx == task->length ) {
		e = new_lex_entry( string_buffer_devote(sb), 
				NULL, 0, sb->length, __LEX_OTHER_WORDS__ );
		e->rlen = wlen;
		//set the secondary mask.
		if ( ssseg ) task_ssseg_open(task);
		return e;
	}

	if ( chkecm != 1 ) {
		/*
		 * check the single words unit.
		 * 	not only the chinese word but also other kinds of word.
		 * so we can recongnize the complex unit like '℉,℃'' eg..
		 * @date 2013-10-14
		 */
		if ( chkunits 
				&& ( friso_numeric_string( friso->charset, sb->buffer ) 
					|| friso_decimal_string( friso->charset, sb->buffer ) ) ) 
		{
			idx = task->idx;
			if ( ( task->bytes = readNextWord(
							friso, task, &idx, task->buffer ) ) != 0 ) 
			{
				//check the EC dictionary.
				if ( friso_dic_match( friso->dic, 
							__LEX_CJK_UNITS__, task->buffer ) )
				{
					fdunits = 1;
					string_buffer_append(sb, task->buffer);
					wlen += task->bytes;
					task->idx += task->bytes;
				}
			}
		} 

		//set the START_SS_MASK
		if ( fdunits != 1 && ssseg ) task_ssseg_open(task);

		//creat the lexicon entry and return it.
		e = new_lex_entry( string_buffer_devote(sb), 
				NULL, 0, sb->length, __LEX_OTHER_WORDS__ );
		e->rlen = wlen;

		return e;
	}


	//Try to find a english chinese mixed word.
	tmp = new_string_buffer_with_string( sb->buffer );
	idx = task->idx;
	for ( t = 0; t < config->mix_len 
			&& ( task->bytes = readNextWord( 
					friso, task, &idx, task->buffer ) ) != 0; t++ ) 
	{
		//if ( ! friso_cn_string( friso->charset, task ) ) {
		//	task->idx -= task->bytes;
		//	break;
		//}
		//replace with the whitespace check.
		//more complex mixed words could be find here. 
		// (no only english and chinese mix word)
		//@date 2013-10-14
		if ( friso_whitespace( friso->charset, task ) ) break;

		string_buffer_append( tmp, task->buffer );

		//check the mixed word dictionary.
		if ( friso_dic_match( friso->dic, 
					__LEX_ECM_WORDS__, tmp->buffer ) ) {
			e = friso_dic_get( friso->dic, 
					__LEX_ECM_WORDS__, tmp->buffer );
		}
	}

	free_string_buffer( tmp );

	/* e is not NULL does't mean it must be EC mixed word.
	 * 	it could be an english and punctuation mixed word, like 'c++'
	 * But we don't need to check and set the START_SS_MASK mask here.
	 * */
	if ( e != NULL ) 
	{
		task->idx += (e->length - sb->length);
		free_string_buffer(sb);
		return e;
	}


	//no match for mix word, try to find a single unit.
	if ( chkunits 
			&& ( friso_numeric_string( friso->charset, sb->buffer ) 
				|| friso_decimal_string( friso->charset, sb->buffer ) ) ) 
	{
		idx = task->idx;
		if ( ( task->bytes = readNextWord( 
						friso, task, &idx, task->buffer ) ) != 0 ) {
			//check the single chinese units dictionary.
			if ( friso_dic_match( friso->dic, 
						__LEX_CJK_UNITS__, task->buffer ) )
			{
				fdunits = 1;
				string_buffer_append( sb, task->buffer );
				wlen += task->bytes;
				task->idx += task->bytes;
			}
		}
	}

	//set the START_SS_MASK.
	if ( fdunits != 1 && ssseg ) task_ssseg_open(task); 

	//create the lexicon entry and return it.
	e = new_lex_entry( string_buffer_devote(sb), 
			NULL, 0, sb->length, __LEX_OTHER_WORDS__  );
	e->rlen = wlen;

	return e;
}
/* }}} */

/* **************************************************************
 * 	mmseg algorithm implemented functions :: start  	*
 ****************************************************************/

/* {{{ get the next match from the current position,
 *		throught the dictionary this will return all the matchs.
 *
 * @return friso_array_t that contains all the matchs.
 */
__STATIC_API__ friso_array_t get_next_match( 
		friso_t friso, 
		friso_config_t config,
		friso_task_t task, 
		uint_t idx ) 
{
	register uint_t t;
	string_buffer_t sb = 
		new_string_buffer_with_string( task->buffer );

	//create a match dynamic array.
	friso_array_t match = 
		new_array_list_with_opacity( config->max_len );
	array_list_add( match, friso_dic_get( 
				friso->dic, __LEX_CJK_WORDS__, task->buffer ) );

	for ( t = 1; t < config->max_len && ( task->bytes = 
				readNextWord( friso, task, &idx, task->buffer ) ) != 0; t++ )  
	{
		if ( friso_whitespace( friso->charset, task ) )	break;
		if ( ! friso_cn_string( friso->charset, task ) ) break;

		//append the task->buffer to the buffer.
		string_buffer_append( sb, task->buffer );

		//check the CJK dictionary.
		if ( friso_dic_match( friso->dic, 
					__LEX_CJK_WORDS__, sb->buffer ) ) {
			/*
			 * add the lex_entry_t insite.
			 * here is a key point:
			 *		we use friso_dic_get function 
			 *		to get the address of the lex_entry_cdt
			 *		that store in the dictionary, 
			 *		not create a new lex_entry_cdt.
			 * so :
			 *		1.we will not bother to the allocations of 
			 *			the newly created lex_entry_cdt.
			 *		2.more efficient of course.
			 */
			array_list_add( match, friso_dic_get( 
						friso->dic, __LEX_CJK_WORDS__, sb->buffer ) );
		}
	}

	/*buffer allocations clear*/
	free_string_buffer( sb );
	//array_list_trim( match );

	return match;
}
/* }}} */

/* {{{ chunk for mmseg defines and functions to handle them.*/
typedef struct {
	friso_array_t words;
	uint_t length;
	float average_word_length;
	float word_length_variance;
	float single_word_dmf;
} friso_chunk_entry;
typedef friso_chunk_entry * friso_chunk_t;
/* }}} */

/* {{{ create a new chunks*/
__STATIC_API__ friso_chunk_t new_chunk( 
		friso_array_t words, uint_t length ) 
{
	friso_chunk_t chunk = ( friso_chunk_t ) 
		FRISO_MALLOC( sizeof( friso_chunk_entry ) );
	if ( chunk == NULL ) {
		___ALLOCATION_ERROR___
	}

	chunk->words = words;
	chunk->length = length;
	chunk->average_word_length = -1;
	chunk->word_length_variance = -1;
	chunk->single_word_dmf = -1;

	return chunk;
}
/* }}} */

/* {{{ free the specified chunk */
__STATIC_API__ void free_chunk( friso_chunk_t chunk ) 
{
	FRISO_FREE( chunk );
}
/* }}} */

/* {{{ a static function to count the average word length
 *	of the given chunk.
 */
__STATIC_API__ float count_chunk_avl( friso_chunk_t chunk ) 
{
	chunk->average_word_length = 
		((float) chunk->length) / chunk->words->length;
	return chunk->average_word_length;
}
/* }}} */

/* {{{ a static function to count the word length variance
 *	of the given chunk.
 */
__STATIC_API__ float count_chunk_var( friso_chunk_t chunk ) 
{
	float var = 0, tmp = 0;			//snapshot
	register uint_t t;
	lex_entry_t e;

	for ( t = 0; t < chunk->words->length; t++ ) {
		e = ( lex_entry_t ) chunk->words->items[t];
		tmp = e->length - chunk->average_word_length;
		var += tmp * tmp;
	}

	chunk->word_length_variance = var / chunk->words->length;

	return chunk->word_length_variance;
}
/* }}} */

/* {{{ a static function to count the single word morpheme degree of freedom
 *	of the given chunk.
 */
__STATIC_API__ float count_chunk_mdf( friso_chunk_t chunk ) 
{
	float __mdf__ = 0;
	register uint_t t;
	lex_entry_t e;

	for ( t = 0; t < chunk->words->length; t++ ) {
		e = ( lex_entry_t ) chunk->words->items[t];
		//single CJK(UTF-8)/chinese(GBK) word.
		//better add a charset check here, but this will works find.
		//all CJK words will take 3 bytes with UTF-8 encoding.
		//all chinese words take 2 bytes with GBK encoding.
		if ( e->length == 3 || e->length == 2 ) {	
			__mdf__ += (float) log( (float)e->fre);
		}
	}
	chunk->single_word_dmf = __mdf__;

	return chunk->single_word_dmf;
}
/* }}} */

/* {{{ chunk printer - use for for debug*/
#define ___CHUNK_PRINTER___( _chunks_ )						\
	for ( t = 0; t < _chunks_->length; t++ ) {					\
		__tmp__ = (( friso_chunk_t ) _chunks_->items[t])->words;		\
		for ( j = 0; j < __tmp__->length; j++ ) {				\
			printf("%s/ ", ( ( lex_entry_t ) __tmp__->items[j] )->word );	\
		}									\
		putchar('\n');								\
	}										\
putchar('\n');									\
/* }}} */

/* {{{ mmseg algorithm core invoke
 * here,
 * we use four rules to filter all the chunks to get the best chunk.
 *		and this is the core of the mmseg alogrithm.
 * 1. maximum match word length.
 * 2. larget average word length.
 * 3. smallest word length variance.
 * 4. largest single word morpheme degrees of freedom.
 */
__STATIC_API__ friso_chunk_t mmseg_core_invoke( friso_array_t chunks ) 
{
	register uint_t t/*, j*/;
	float max;
	friso_chunk_t e;
	friso_array_t __res__, __tmp__;
	__res__ = new_array_list_with_opacity( chunks->length );


	//1.get the maximum matched chunks.
	//count the maximum length
	max = ( float ) ( ( friso_chunk_t ) chunks->items[0] )->length;
	for ( t = 1; t < chunks->length; t++ ) {
		e = ( friso_chunk_t ) chunks->items[t];
		if ( e->length > max )
			max = ( float ) e->length;
	}
	//get the chunk items that owns the maximum length.
	for ( t = 0; t < chunks->length; t++ ) {
		e = ( friso_chunk_t ) chunks->items[t];
		if ( e->length >= max ) {
			array_list_add( __res__, e );
		} else {
			free_array_list( e->words );
			free_chunk( e );
		}
	}
	//check the left chunks
	if ( __res__->length == 1 ) {
		e = ( friso_chunk_t ) __res__->items[0];
		free_array_list( __res__ );
		free_array_list( chunks );
		return e;
	} else {
		__tmp__ = array_list_clear( chunks );
		chunks = __res__;
		__res__ = __tmp__;
	}


	//2.get the largest average word length chunks.
	//count the maximum average word length.
	max = count_chunk_avl( ( friso_chunk_t ) chunks->items[0] );
	for ( t = 1; t < chunks->length; t++ ) {
		e = ( friso_chunk_t ) chunks->items[t];
		if ( count_chunk_avl( e ) > max ) {
			max = e->average_word_length;
		}
	}
	//get the chunks items that own the largest average word length.
	for ( t = 0; t < chunks->length; t++ ) {
		e = ( friso_chunk_t ) chunks->items[t];
		if ( e->average_word_length >= max ) {
			array_list_add( __res__, e );
		} else {
			free_array_list( e->words );
			free_chunk( e );
		}
	}
	//check the left chunks
	if ( __res__->length == 1 ) {
		e = ( friso_chunk_t ) __res__->items[0];
		free_array_list( __res__);
		free_array_list( chunks );
		return e;
	} else {
		__tmp__ = array_list_clear( chunks );
		chunks = __res__;
		__res__ = __tmp__;
	}


	//3.get the smallest word length variance chunks
	//count the smallest word length variance
	max = count_chunk_var( ( friso_chunk_t ) chunks->items[0] );
	for ( t = 1; t < chunks->length; t++ ) {
		e = ( friso_chunk_t ) chunks->items[t];
		if ( count_chunk_var( e ) < max ) {
			max = e->word_length_variance;
		}
	}
	//get the chunks that own the smallest word length variance.
	for ( t = 0; t < chunks->length; t++ ) {
		e = ( friso_chunk_t ) chunks->items[t];
		if ( e->word_length_variance <= max ) {
			array_list_add( __res__, e );
		} else {
			free_array_list( e->words );
			free_chunk( e );
		}
	}
	//check the left chunks
	if ( __res__->length == 1 ) {
		e = ( friso_chunk_t ) __res__->items[0];
		free_array_list( chunks );
		free_array_list( __res__ );
		return e;
	} else {
		__tmp__ = array_list_clear( chunks );
		chunks = __res__;
		__res__ = __tmp__;
	}


	//4.get the largest single word morpheme degrees of freedom.
	//count the maximum single word morpheme degreees of freedom
	max = count_chunk_mdf( ( friso_chunk_t ) chunks->items[0] );
	for ( t = 1; t < chunks->length; t++ ) {
		e = ( friso_chunk_t ) chunks->items[t];
		if ( count_chunk_mdf( e ) > max ) {
			max = e->single_word_dmf;
		}
	} 
	//get the chunks that own the largest single word word morpheme degrees of freedom.
	for ( t = 0; t < chunks->length; t++ ) {
		e = ( friso_chunk_t ) chunks->items[t];
		if ( e->single_word_dmf >= max ) {
			array_list_add( __res__, e );
		} else {
			free_array_list( e->words );
			free_chunk( e );
		}
	}


	/*
	 * there is still more than one chunks?
	 *		well, this rarely happen but still happens.
	 * here we simple return the first chunk as the final result,
	 * 		and we need to free the all the chunks that __res__
	 * 	points to except the 1th one.
	 * you have to do two things to totaly free a chunk:
	 * 1. call free_array_list to free the allocations of a chunk's words.
	 * 2. call free_chunk to the free the allocations of a chunk. 
	 */
	for ( t = 1; t < __res__->length; t++ ) {
		e = ( friso_chunk_t ) __res__->items[t];
		free_array_list( e->words );
		free_chunk( e );
	}

	e = ( friso_chunk_t ) __res__->items[0];
	free_array_list( chunks );
	free_array_list( __res__ );

	return e;
}
/* }}} */

/* {{{ get the next cjk word from the current position with complex mode.
 *	this is the core of the mmseg chinese word segemetation algorithm.
 *	we use four rules to filter the matched chunks and get the best one
 *		as the final result.
 *
 * @see mmseg_core_invoke( chunks );
 */
__STATIC_API__ lex_entry_t next_complex_cjk( 
		friso_t friso,
		friso_config_t config,
		friso_task_t task ) 
{
	register uint_t x, y, z;
	/*bakup the task->bytes here*/
	uint_t __idx__ = task->bytes;
	lex_entry_t fe, se, te;
	friso_chunk_t e;
	friso_array_t words, chunks;
	friso_array_t smatch, tmatch, fmatch = 
		get_next_match( friso, config, task, task->idx );

	/*
	 * here:
	 *		if the length of the fmatch is 1, mean we don't have to
	 * continue the following work. ( no matter what we get the same result. )
	 */
	if ( fmatch->length == 1 ) 
	{
		fe = ( ( lex_entry_t ) fmatch->items[0] );
		free_array_list( fmatch );

		/*
		 * check and clear the stop words . 
		 * @date 2013-06-13
		 */
		if ( config->clr_stw && 
				friso_dic_match( friso->dic, 
					__LEX_STOPWORDS__, fe->word ) ) {
			return NULL;
		}

		return fe;
	}

	chunks = new_array_list();
	task->idx -= __idx__;


	for ( x = 0; x < fmatch->length; x++ ) 
	{
		/*get the word and try the second layer match*/
		fe = ( lex_entry_t ) array_list_get( fmatch, x );
		__idx__ = task->idx + fe->length;
		readNextWord( friso, task, &__idx__, task->buffer );

		if ( task->bytes != 0 
				&& friso_cn_string( friso->charset, task ) 
				&& friso_dic_match( friso->dic, 
					__LEX_CJK_WORDS__, task->buffer ) ) 
		{
			//get the next matchs
			smatch = get_next_match( friso, config, task, __idx__ );
			for ( y = 0; y < smatch->length; y++ ) 
			{
				/*get the word and try the third layer match*/
				se = ( lex_entry_t ) array_list_get( smatch, y );
				__idx__ = task->idx + fe->length + se->length;
				readNextWord( friso, task, &__idx__, task->buffer );

				if ( task->bytes != 0 
						&& friso_cn_string( friso->charset, task )
						&& friso_dic_match( friso->dic, 
							__LEX_CJK_WORDS__, task->buffer ) ) 
				{
					//get the matchs.
					tmatch = get_next_match( friso, config, task, __idx__ );
					for ( z = 0; z < tmatch->length; z++ ) 
					{
						te = ( lex_entry_t ) array_list_get( tmatch, z );
						words = new_array_list_with_opacity(3);
						array_list_add( words, fe );
						array_list_add( words, se );
						array_list_add( words, te );
						array_list_add( chunks, new_chunk( words, 
									fe->length + se->length + te->length ) );
					}
					//free the third matched array list
					free_array_list( tmatch );
				} 
				else 
				{
					words = new_array_list_with_opacity(2);
					array_list_add( words, fe );
					array_list_add( words, se );
					//add the chunk
					array_list_add( chunks,
							new_chunk( words, fe->length + se->length ) );
				}
			}
			//free the second match array list
			free_array_list( smatch );
		} 
		else 
		{
			words = new_array_list_with_opacity(1);
			array_list_add( words, fe );
			array_list_add( chunks, new_chunk( words, fe->length ) );
		}
	}
	//free the first match array list
	free_array_list( fmatch );

	/*
	 * filter the chunks with the four rules of the mmseg algorithm
	 *		and get best chunk as the final result.
	 *
	 * @see mmseg_core_invoke( chunks );
	 * @date 2012-12-13
	 */
	if ( chunks->length > 1 ) {
		e = mmseg_core_invoke( chunks );
	} else {
		e = ( friso_chunk_t ) chunks->items[0];
	}

	fe = ( lex_entry_t ) e->words->items[0];
	task->idx += fe->length;			//reset the idx of the task.
	free_array_list(e->words);			//free the chunks words allocation
	free_chunk( e );

	//clear the stop words
	if ( config->clr_stw && 
			friso_dic_match( friso->dic, 
				__LEX_STOPWORDS__, fe->word ) ) {
		return NULL;
	}

	return fe;
}
/* }}} */

/* **********************************************************
 * mmseg algorithm implemented functions : end  	    *
 ************************************************************/

/* {{{ A macro function to check and free
 * 	the lex_entry_t with type of __LEX_OTHER_WORDS__.
 */
#define check_free_otlex_entry( lex ) \
	do { \
		if ( lex->type == __LEX_OTHER_WORDS__ )	{	\
			FRISO_FREE( lex->word );				\
			free_lex_entry( lex );					\
		}\
	} while (0)
/* }}} */

/* {{{ sphinx style output synonyms words append.
 *
 * @param	task
 * @param	lex
 * */
__STATIC_API__ void hits_sphinx_output( 
		friso_task_t task, 
		lex_entry_t lex )
{
	uint_t i, j, len;
	fstring _word;
	len = lex->length;

	//append the synoyums words.
	for ( i = 0; i < lex->syn->length; i++  )
	{
		_word = ( fstring ) lex->syn->items[i];
		j = strlen(_word);
		if ( ( len + j + 1 ) >= __HITS_WORD_LENGTH__ ) break;
		memcpy(task->hits->word + len, "|", 1);
		len += 1;
		memcpy(task->hits->word + len, _word, j);
		len += j;
	} 

	//set the new end of the buffer.
	task->hits->word[len] = '\0';
}
/* }}} */

/* {{{ normal style output synonyms words append.
 *
 * @param	task
 * @param	lex
 * @param	front	1 for add the synoyum words from the head and 
 * 					0 for append from the tail.
 * */
__STATIC_API__ void hits_normal_output( 
		friso_task_t task,
		lex_entry_t lex, 
		int front )
{
	uint_t i;
	fstring _word;
	lex_entry_t e;

	for ( i = 0; i < lex->syn->length; i++  ) 
	{
		_word = ( fstring ) lex->syn->items[i];
		e = new_lex_entry( _word, NULL, 0, 	
				strlen(_word), __LEX_NCSYN_WORDS__ );
		e->offset = lex->offset;
		//add to the buffer.
		if ( front ) 
			link_list_add_first( task->pool, e );
		else link_list_add( task->pool, e);
	} 
}
/* }}} */

/* {{{ do the secondary segmentation of the complex english token.
 *
 * @param	friso
 * @param	config
 * @param	task
 * @param	lex
 * @param	retfw	-Wether to return the first word.
 * @return	lex_entry_t(NULL or the first sub token of the lex)
 */
__STATIC_API__ lex_entry_t en_second_seg( 
		friso_t friso,
		friso_config_t config,
		friso_task_t task,
		lex_entry_t lex, int retfw )
{
	//printf("sseg: %d\n", (task->ctrlMask & START_SS_MASK));

	int j, p = 0, start = 0;
	fstring str = lex->word;

	lex_entry_t fword = NULL, sword = NULL;

	int _ctype, _TYPE = get_enchar_type(str[0]);
	string_buffer_clear(task->sbuf);
	string_buffer_append_char(task->sbuf, str[0]);

	for ( j = 1; j < lex->length; j++ )
	{
		//get the type of the char
		_ctype = get_enchar_type(str[j]);
		if ( _ctype == FRISO_EN_WHITESPACE )
		{
			_TYPE = FRISO_EN_WHITESPACE;
			p++;
			continue;
		}

		if ( _ctype == _TYPE ) 
			string_buffer_append_char(task->sbuf, str[j]);
		else
		{
			start = j - task->sbuf->length - p;

			/* If the number of chars of current type
			 * 	is larger than config->st_minl then we will
			 * 	create a new lex_entry_t and append it to the task->wordPool.
			 * */
			if ( task->sbuf->length >= config->st_minl 
					&& ! ( config->clr_stw && friso_dic_match( friso->dic, 
							__LEX_STOPWORDS__, task->sbuf->buffer ) ) )
			{
				/* the allocation of lex_entry_t and its word 
				 * 	should be released and the type of the lex_entry_t
				 * 	must be __LEX_OTHER_WORDS__.
				 * */
				sword = new_lex_entry(strdup(task->sbuf->buffer), 
						NULL, 0, task->sbuf->length, __LEX_OTHER_WORDS__);
				sword->offset = lex->offset + start;
				if ( retfw && fword == NULL ) fword = sword;
				else link_list_add(task->pool, sword);
			}

			string_buffer_clear(task->sbuf);
			string_buffer_append_char(task->sbuf, str[j]);
			p = 0;
			_TYPE = _ctype;
		}
	}

	//continue to check the last item.
	if ( task->sbuf->length >= config->st_minl 
			&& ! ( config->clr_stw && friso_dic_match( friso->dic, 
					__LEX_STOPWORDS__, task->sbuf->buffer ) ) )
	{
		start = j - task->sbuf->length;
		sword = new_lex_entry(strdup(task->sbuf->buffer), 
				NULL, 0, task->sbuf->length, __LEX_OTHER_WORDS__);
		sword->offset = j - task->sbuf->length;
		if ( retfw && fword == NULL ) fword = sword;
		else link_list_add(task->pool, sword);
	}

	return fword;
}
/*}}}*/

/* {{{ english synoyums words check and append macro define.*/
#define append_en_syn( lex, tmp, front )\
	do {\
		if ( ( tmp = friso_dic_get(friso->dic,	\
						__LEX_EN_WORDS__, lex->word) ) != NULL	\
				&& (tmp->syn) != NULL ) 		\
		{\
			if ( config->spx_out == 1 )			\
			hits_sphinx_output(task, tmp);		\
			else \
			{\
				tmp->offset = lex->offset;		\
				hits_normal_output(task, tmp, front);	\
			}\
		}\
	} while (0)
/* }}} */

/* {{{ get the next segmentation.
 * 	and also this is the friso enterface function.
 *
 * @param 	friso.
 * @param	config.
 * @return	task.
 */
FRISO_API friso_hits_t friso_next( 
		friso_t friso, 
		friso_config_t config, 
		friso_task_t task ) 
{
	uint_t j, len = 0;
	string_buffer_t sb = NULL;
	lex_entry_t lex = NULL, tmp = NULL, sword = NULL;

	/* {{{ task word pool check */
	if ( ! link_list_empty( task->pool ) ) {
		/*
		 * load word from the word poll if it is not empty.
		 *  this will make the next word more convenient and efficient.
		 * 	often synonyms, newly created word will be stored in the poll.
		 */
		lex = ( lex_entry_t ) link_list_remove_first( task->pool );
		memcpy(task->hits->word, lex->word, lex->length);
		task->hits->type = lex->type;
		task->hits->length = lex->length;
		task->hits->rlen = lex->rlen;
		task->hits->offset = lex->offset;
		task->hits->word[lex->length] = '\0';

		/* check and handle the english synonyms words append mask.
		 * 	Also we have to close the mask after finish the operation.
		 *
		 * 1. we've check the config->add_syn before open the 
		 * 		_LEX_APPENSYN_MASK mask.
		 * 2. we should add the synonyms words of the curren 
		 * 		lex_entry_t from the head.
		 *
		 * @since: 1.6.0
		 * */
		if ( lex_appensyn_check(lex) ) 
		{
			lex_appensyn_close(lex);
			append_en_syn(lex, tmp, 1);
		}

		/*
		 * __LEX_NCSYN_WORDS__:
		 *  these lex_entry_t was created to store the the synonyums words.
		 * 	and its word pointed to the lex_entry_t's synonyms word of
		 * 		friso->dic, so :
		 * 	free the lex_entry_t but not its word here.
		 *
		 * __LEX_OTHER_WORDS__:
		 *  newly created lexicon entry, like the chinese and english mixed word.
		 * 	during the invoke of function next_basic_latin.
		 *
		 * other type:
		 *  they must exist in the dictionary, so just pass them.
		 */
		switch ( lex->type ) 
		{
			case __LEX_OTHER_WORDS__: 
				FRISO_FREE( lex->word );
				free_lex_entry( lex );
				break;
			case __LEX_NCSYN_WORDS__:
				free_lex_entry( lex );
				break;
		}

		return task->hits;
	}
	/* }}} */

	while ( task->idx < task->length ) 
	{
		//read the next word from the current position.
		task->bytes = readNextWord( friso, task, &task->idx, task->buffer );
		if ( task->bytes == 0 ) break;

		//clear up the whitespace.
		if ( friso_whitespace( friso->charset, task ) ) continue;

		/* {{{ CJK words recongnize block. */
		if ( friso_cn_string( friso->charset, task ) ) 
		{
			/* check the dictionary.
			 * and return the unrecognized CJK char as a single word.
			 * */
			if ( ! friso_dic_match( friso->dic, 
						__LEX_CJK_WORDS__, task->buffer) )
			{
				memcpy(task->hits->word, task->buffer, task->bytes );
				task->hits->type = __LEX_PUNC_WORDS__;
				task->hits->length = task->bytes;
				task->hits->rlen = task->bytes;
				task->hits->offset = task->idx - task->bytes;
				task->hits->word[(int)task->bytes] = '\0';
				return task->hits;
			}

			//complex mode.
			if ( config->mode == __FRISO_COMPLEX_MODE__ ) 
				lex = next_complex_cjk( friso, config, task );
			//simple  mode.
			else lex = next_simple_cjk( friso, config, task );

			if ( lex == NULL ) continue;	//find a stopwrod.
			lex->offset = task->idx - lex->rlen;

			/*
			 * try to find a chinese and english mixed words, like '卡拉ok'
			 * 	keep in mind that is not english and chinese mixed words
			 * 		like 'x射线'.
			 * 	
			 * @reader:
			 * 1. only if the char after the current word is an english char.
			 * 2. if the first point meet, friso will call next_basic_latin() to
			 * 		get the next basic latin. (yeah, you have to handle it).
			 * 3. if match a CE word, set lex to the newly match CE word.
			 * 4. if no match a CE word, we will have to append the basic latin
			 * 		to the pool, and it should after the append of synonyms words.
			 * 5. do not use the task->buffer and task->unicode as the check 
			 * 		condition for the CE word identify.
			 * 6. Add friso_numeric_letter check so can get work like '高3'
			 *
			 * @date 2013-09-02
			 */
			if ( ( task->idx < task->length ) 
					&& ((int)task->text[task->idx]) > 0 
					&& ( friso_en_letter( friso->charset, task ) 
						|| friso_numeric_letter(friso->charset, task) ) )
			{
				//create a string buffer
				sb = new_string_buffer_with_string(lex->word);

				//find the next basic latin.
				task->buffer[0] = task->text[task->idx++];
				task->buffer[1] = '\0';
				tmp = next_basic_latin(friso, config, task);
				tmp->offset = task->idx - tmp->length;
				string_buffer_append( sb, tmp->word );

				//check the CE dictionary.
				if ( friso_dic_match( friso->dic, 
							__LEX_CEM_WORDS__, sb->buffer ) )
				{
					j = lex->offset; //bakup the offset.
					lex = friso_dic_get( friso->dic, 
							__LEX_CEM_WORDS__, sb->buffer );
					lex->offset = j;
					check_free_otlex_entry(tmp);
					free_string_buffer(sb);
					tmp = NULL; sb = NULL;
				}
			}

			/*
			 * copy the lex_entry to the result hits
			 *
			 * @reader: (boodly lession, added 2013-08-31):
			 * 	don't bother to handle the task->hits->offset problem.
			 * 		is has been sovled perfectly above. 
			 */
			len = (int) lex->length;
			memcpy(task->hits->word, lex->word, lex->length);
			task->hits->type = lex->type;
			task->hits->length = lex->length;
			task->hits->rlen = lex->rlen;
			task->hits->offset = lex->offset;
			task->hits->word[len] = '\0';

			//check and append the synonyms words
			if ( config->add_syn && lex->syn != NULL ) 
			{
				if ( config->spx_out == 1 )
					hits_sphinx_output(task, lex);
				else hits_normal_output(task, lex, 0);
			}

			/* {{{ here: handle the newly found basic latin created when 
			 * we try to find a CE word.
			 *
			 * @reader:
			 * when tmp is not NULL and sb will not be NULL too
			 * 	except a CE word is found.
			 *
			 * @TODO: finished append the synonyms words on 23013-12-19.
			 */
			if ( tmp != NULL && sb != NULL ) 
			{
				//check the secondary split.
				if ( config->en_sseg == 1 
						&& task_ssseg_check(task) )
					en_second_seg(friso, config, task, tmp, 0);

				free_string_buffer( sb );
				link_list_add( task->pool, tmp );

				//check if append synoyums words.
				if ( config->add_syn == 1 ) lex_appensyn_open(tmp);

			}
			/* }}} */

			return task->hits;
		} 
		/* }}} */

		/* {{{ basic english/latin recongnize block. */
		else if ( friso_halfwidth_en_char( friso->charset, task ) 
				|| friso_fullwidth_en_char( friso->charset, task ) ) 
		{
			/*
			 * handle the english punctuation.
			 *
			 * @todo:
			 * 1. commen all the code of the following if 
			 * 	and uncomment the continue to clear up the punctuation directly.
			 *
			 * @reader: 
			 * 2. keep in mind that ALL the english punctuation will be handled here,
			 *  (when a english punctuation is found during the other process, we will
			 *  	reset the task->idx back to it and then back here)
			 * 	except the keep punctuation(define in file friso_string.c) 
			 * 	that will make up a word with the english chars around it.
			 */
			if ( friso_en_punctuation( friso->charset, task ) ) 
			{
				if ( config->clr_stw 
						&& friso_dic_match(friso->dic, 
							__LEX_STOPWORDS__, task->buffer) )
					continue;
				//count the punctuation in.
				task->hits->word[0] = task->buffer[0];
				task->hits->type = __LEX_PUNC_WORDS__;
				task->hits->length = task->bytes;
				task->hits->rlen = task->bytes;
				task->hits->offset = task->idx - task->bytes;
				task->hits->word[1] = '\0';
				return task->hits;

				//continue
			}	

			//get the next basic latin word.
			lex = next_basic_latin( friso, config, task );
			lex->offset = task->idx - lex->rlen;

			/* @added: 2013-12-22
			 * check and do the secondary segmentation work.
			 * this will split 'qq2013' to 'qq, 2013'
			 * */
			sword = NULL;
			if ( config->en_sseg == 1 
					&& task_ssseg_check(task) )
				sword = en_second_seg(friso, config, task, lex, 1);

			//check if it is a stopword.
			if ( config->clr_stw 
					&& friso_dic_match( friso->dic, 
						__LEX_STOPWORDS__, lex->word ) ) {
				//free the newly created lexicon entry.
				check_free_otlex_entry( lex );
				if ( sword == NULL ) continue;
				lex = sword;
			}
			else if ( sword != NULL )
			{
				if ( config->add_syn == 1 ) lex_appensyn_open(lex);
				link_list_add(task->pool, lex);

				/* If the sub token is not NULL:
				 * add the lex to the task->pool if it is not NULL 
				 * and return the sub token istead of lex so
				 * 	the sub tokens will be output ahead of lex.
				 * */
				lex = sword;
			}

			//if the token is longer than __HITS_WORD_LENGTH__, drop it 
			//copy the word to the task hits buffer.
			if ( lex->length >= __HITS_WORD_LENGTH__ ) continue;
			memcpy(task->hits->word, lex->word, lex->length);
			task->hits->type = lex->type;
			task->hits->length = lex->length;
			task->hits->rlen = lex->rlen;
			task->hits->offset = lex->offset;
			task->hits->word[lex->length] = '\0';

			/* If sword is NULL, continue to check and append 
			 * tye synoyums words for the current lex_entry_t.
			 * */
			if ( sword == NULL 
					&& config->add_syn == 1 ) append_en_syn(lex, tmp, 0);

			//free the newly create lex_entry_t
			check_free_otlex_entry( lex );

			return task->hits;
		} 
		/* }}} */

		/* {{{ Keep the chinese punctuation.
		 * @added 2013-08-31) */
		else if ( friso_cn_punctuation( friso->charset, task ) )
		{
			if ( config->clr_stw 
					&& friso_dic_match(friso->dic, 
						__LEX_STOPWORDS__, task->buffer) )
				continue;
			//count the punctuation in.
			memcpy(task->hits->word, task->buffer, task->bytes);
			task->hits->type = __LEX_PUNC_WORDS__;
			task->hits->length = task->bytes;
			task->hits->offset = task->idx - task->bytes;
			task->hits->word[task->bytes] = '\0';
			return task->hits;
		}
		/* }}} */
		//else if ( friso_letter_number( friso->charset, task ) ) 
		//{
		//} 
		//else if ( friso_other_number( friso->charset, task ) ) 
		//{
		//}

		/* {{{ keep the unrecognized words?
		//@date 2013-10-14 */
		else if ( config->keep_urec ) 
		{
			memcpy(task->hits->word, task->buffer, task->bytes);
			task->hits->type = __LEX_UNKNOW_WORDS__;
			task->hits->length = task->bytes;
			task->hits->offset = task->idx - task->bytes;
			task->hits->word[task->bytes] = '\0';
			return task->hits;
		}
		/* }}} */
	}

	return NULL;
}
/* }}} */
